---
title: "alive_thrive"
author: "Sherry Hou"
date: "6/22/2020"
output: html_document
---
# Purpose: This program cleans the raw data and provides basic descriptive analyses from the Alive & Thrive breastfeeding program in Vietnam.

## Read in the excel file: "CoE mother survey data_Shared with McGill_Updated until 1st quarter of 2020.xlsx"
```{r setup, include=FALSE}
setwd("~/Dropbox/A&T study of CoE/data/")

library(tidyverse)
library(openxlsx)

#read data into r
df <- read.xlsx("CoE mother survey data_Shared with McGill_Updated until 1st quarter of 2020.xlsx")
str(df)
```

## This next section is for general cleaning and data restructuring
This outputs 'alive_thrive_clean.csv', which should be the basic starting point for future analysis. More covariates were created (enrollment date, status, and other covariates used for regression analysis are created in subsequent sections and are included in the 'alive_thrive_clean_w_covars.csv' file). For analyses of or with these covariates, please be sure to use the 'alive_thrive_clean_w_covars.csv' file.
```{r}
#create unique for each row (which represents a unique birth)
id <- c(1:nrow(df))
#create a new copy of df with unique id. All work will be done in df1 to preserve the original df
df1 <- cbind(id=id,df)

#recode all the 2-no to 0-no
df1$Q02[df1$Q02==2] <- 0
df1$Q04[df1$Q04==2] <- 0

#distinguishing missing in Q09 from 0: if Q09a not missing, then Q09a1-Q09a9 are 0 if not flagged
df1$Q09a1[(!is.na(df1$Q09a)) & is.na(df1$Q09a1)] <- 0
df1$Q09a2[(!is.na(df1$Q09a)) & is.na(df1$Q09a2)] <- 0
df1$Q09a3[(!is.na(df1$Q09a)) & is.na(df1$Q09a3)] <- 0
df1$Q09a4[(!is.na(df1$Q09a)) & is.na(df1$Q09a4)] <- 0
df1$Q09a5[(!is.na(df1$Q09a)) & is.na(df1$Q09a5)] <- 0
df1$Q09a7[(!is.na(df1$Q09a)) & is.na(df1$Q09a7)] <- 0
df1$Q09a8[(!is.na(df1$Q09a)) & is.na(df1$Q09a8)] <- 0
df1$Q09a9[(!is.na(df1$Q09a)) & is.na(df1$Q09a9)] <- 0

#For the free text in Q09a, recode to Q09a1-Q09a9
#removing leading and trailing spaces
df1$Q09a <- trimws(df1$Q09a)
#Lack or delay of milk supply
df1$Q09a1[df1$Q09a=="Lack of human milk supply"] <- 1
df1$Q09a1[df1$Q09a=="Delay in human milk supply"] <- 1
df1$Q09a1[df1$Q09a=="Lack of human milk supply in the first day"] <- 1
df1$Q09a1[df1$Q09a=="Lack of human milk supply so baby was hungry"] <- 1
df1$Q09a1[df1$Q09a=="Mother didn't have milk"] <- 1

#How does c-section affect breastmilk production?
#In this case, which is more important "delay" or "c-section"? Maybe both?
df1$Q09a2[df1$Q09a=="Delay in human milk supply because of c-section"] <- 1
df1$Q09a2[df1$Q09a=="Delay in human milk supply because of c-section delivery"] <- 1
df1$Q09a2[df1$Q09a=="Delay in human milk supply because of c-section delivery and giving birth to twins"] <- 1
df1$Q09a2[df1$Q09a=="Lack of human milk supply because of c-section delivery"] <- 1

#Mother is sick
df1$Q09a3[df1$Q09a=="Mother has hepatitis B"] <- 1
df1$Q09a3[df1$Q09a=="Mother was in emergency room because of haemorrhage"] <- 1

#recode Q9a to corresponding options in Q9a1-Q9a9
df1$Q09a4[df1$Q09a=="Baby got infection"] <- 1
df1$Q09a4[df1$Q09a=="Preterm birth"] <- 1

#Other reasons: 
df1$Q09a7[df1$Q09a=="Family worried that mother wouldn't have milk"] <- 1
df1$Q09a7[df1$Q09a=="Low birth weight"] <- 1
df1$Q09a7[df1$Q09a=="Mother didn't know which kind of milk is good for baby"] <- 1
df1$Q09a7[df1$Q09a=="Mother worried that her milk wasn't enough for baby"] <- 1
df1$Q09a7[df1$Q09a=="Worry about lack of milk"] <- 1

#rename the Q10/0-Q10/9 variables to Q10a0-Q10a9
df1 <- df1 %>% rename(Q10a0=`Q10/0`
                      ,Q10a1=`Q10/1`
                      ,Q10a2=`Q10/2`
                      ,Q10a3=`Q10/3`
                      ,Q10a4=`Q10/4`
                      ,Q10a5=`Q10/5`
                      ,Q10a6=`Q10/6`
                      ,Q10a9=`Q10/9`)

#if Q10a0 is 1 (none of the above) and Q10a1-Q10a6 is missing, then Q10a1 to Q10a9 is 0
#could also be written into a loop, but I don't understand R
df1$Q10a1 <- ifelse(df1$Q10a0==1 & is.na(df1$Q10a1),0,df1$Q10a1)
df1$Q10a2 <- ifelse(df1$Q10a0==1 & is.na(df1$Q10a2),0,df1$Q10a2)
df1$Q10a3 <- ifelse(df1$Q10a0==1 & is.na(df1$Q10a3),0,df1$Q10a3)
df1$Q10a4 <- ifelse(df1$Q10a0==1 & is.na(df1$Q10a4),0,df1$Q10a4)
df1$Q10a5 <- ifelse(df1$Q10a0==1 & is.na(df1$Q10a5),0,df1$Q10a5)
df1$Q10a6 <- ifelse(df1$Q10a0==1 & is.na(df1$Q10a6),0,df1$Q10a6)
df1$Q10a9 <- ifelse(df1$Q10a0==1 & is.na(df1$Q10a9),0,df1$Q10a9)

#if any of the Q10a1 to Q10a6 within an observation is not missing, then the columns that NA is recoded as 0
df1$Q10a1 <- ifelse(((!is.na(df1$Q10a2))|(!is.na(df1$Q10a3))|(!is.na(df1$Q10a4))|(!is.na(df1$Q10a5))|(!is.na(df1$Q10a6))|(!is.na(df1$Q10a9)))&(is.na(df1$Q10a1))
                    ,0
                    ,df1$Q10a1)
df1$Q10a2 <- ifelse(((!is.na(df1$Q10a1))|(!is.na(df1$Q10a3))|(!is.na(df1$Q10a4))|(!is.na(df1$Q10a5))|(!is.na(df1$Q10a6))|(!is.na(df1$Q10a9)))&is.na(df1$Q10a2)
                    ,0
                    ,df1$Q10a2)
df1$Q10a3 <- ifelse(((!is.na(df1$Q10a1))|(!is.na(df1$Q10a2))|(!is.na(df1$Q10a4))|(!is.na(df1$Q10a5))|(!is.na(df1$Q10a6))|(!is.na(df1$Q10a9)))&(is.na(df1$Q10a3))
                    ,0
                    ,df1$Q10a3)
df1$Q10a4 <- ifelse(((!is.na(df1$Q10a1))|(!is.na(df1$Q10a3))|(!is.na(df1$Q10a2))|(!is.na(df1$Q10a5))|(!is.na(df1$Q10a6))|(!is.na(df1$Q10a9)))&(is.na(df1$Q10a4))
                    ,0
                    ,df1$Q10a4)
df1$Q10a5 <- ifelse(((!is.na(df1$Q10a1))|(!is.na(df1$Q10a3))|(!is.na(df1$Q10a2))|(!is.na(df1$Q10a4))|(!is.na(df1$Q10a6))|(!is.na(df1$Q10a9)))&(is.na(df1$Q10a5))
                    ,0
                    ,df1$Q10a5)
df1$Q10a6 <- ifelse(((!is.na(df1$Q10a1))|(!is.na(df1$Q10a3))|(!is.na(df1$Q10a2))|(!is.na(df1$Q10a4))|(!is.na(df1$Q10a5))|(!is.na(df1$Q10a9)))&(is.na(df1$Q10a6))
                    ,0
                    ,df1$Q10a6)
df1$Q10a9 <- ifelse(((!is.na(df1$Q10a1))|(!is.na(df1$Q10a3))|(!is.na(df1$Q10a2))|(!is.na(df1$Q10a4))|(!is.na(df1$Q10a5))|(!is.na(df1$Q10a6)))&(is.na(df1$Q10a9))
                    ,0
                    ,df1$Q10a9)
#if any of the Q10a1-Q10a9 is 1, then Q10a0 is 0
df1$Q10a0 <- ifelse((df1$Q10a1==1)|(df1$Q10a2==1)|(df1$Q10a3==1)|(df1$Q10a4==1)|(df1$Q10a5==1)|(df1$Q10a6==1)|(df1$Q10a9==1)
                    ,0
                    ,df1$Q10a0)

write.csv(df1,'alive_thrive_clean_20200920.csv')
```

## Create variables for enrollment dates and status (includes private hospital status)
As mentioned before, the following code create the 'alive_thrive_clean_w_covars.csv' file. For all analyses of or with the covariates, please use the dataset with the covariates.
```{r}
#creating enrollment variable. Of note, hospital 1 was withdrawn

table(df1$Hospital)
#hospital 1 withdrew from enrollment, leaving it's enrollment date as NA

df1$enroll_date <- ifelse(df1$Hospital == 1, NA, ifelse(df1$Hospital ==2, "02/04/2019", ifelse(df1$Hospital == 3, "01/04/2019",ifelse(
  df1$Hospital == 4, "03/04/2019", ifelse(df1$Hospital == 5, "04/04/2019", ifelse(df1$Hospital == 6, "05/04/2019", ifelse(df1$Hospital == 7, "24/03/2019", ifelse(df1$Hospital == 8, "25/03/2019", ifelse(df1$Hospital ==  9, "26/03/2019", ifelse(df1$Hospital == 10, "27/03/2019", ifelse(df1$Hospital == 11, "13/04/2019", ifelse(df1$Hospital == 12, "16/03/2019", ifelse(df1$Hospital == 13, "17/03/2019", ifelse(df1$Hospital == 14, "16/03/2019", ifelse(df1$Hospital == 15, "15/03/2019", ifelse(df1$Hospital == 16, "14/03/2019", ifelse(df1$Hospital == 17, "15/03/2019", ifelse(df1$Hospital == 18, "07/06/2019", ifelse(df1$Hospital == 19, "16/05/2019", ifelse(df1$Hospital == 20, "16/05/2019", ifelse(df1$Hospital == 21, "17/05/2019", ifelse(df1$Hospital == 22, "17/05/2019", ifelse(df1$Hospital == 23, "11/02/2020", ifelse(df1$Hospital == 24, "10/02/2020", ifelse(df1$Hospital == 25, "26/07/2019", ifelse(df1$Hospital == 26, "25/07/2019", ifelse(df1$Hospital == 27, "19/07/2019", ifelse(df1$Hospital == 28, "18/07/2019", ifelse(df1$Hospital == 29, "24/05/2019", NA)))))))))))))))))))))))))))))

df1$enroll_date <- as.Date(df1$enroll_date, "%d/%m/%Y")
table(df1$enroll_date)

table(df1$Bmonth)
table(df1$Byear)

sum(table(df1$Bmonth))
sum(table(df1$Byear))

#one entry missing birth month and year information -- making NA

df1$dob <- paste("15", df1$Bmonth, df1$Byear, sep = "/")
table(df1$dob)

df1$dob <- ifelse(df1$dob == "15/NA/NA", NA, df1$dob)
table(df1$dob)

df1$dob <- as.Date(df1$dob, "%d/%m/%Y")
table(df1$dob)

#note: R counts dates as the number of days since 1/1/1960, so if a date if we're looking for enrollment date
#to come before dob we use "<" to signify this, as the number of days will be fewer since 1960 as it came beforehand.

df1$enrolled <- ifelse(df1$enroll_date <= df1$dob, 1, 0)
table(df1$enrolled)


#making designation variable variable
#using list of CoE hospitals document
table(df$Hospital)

#first using binary yes/no designated variable
df1$designation <- ifelse(df1$Hospital == 3, 1, ifelse(df1$Hospital == 7, 1, ifelse(df1$Hospital == 11, 1, ifelse(
df1$Hospital == 12, 1, ifelse(df1$Hospital == 18, 1, ifelse(df1$Hospital == 20, 1, ifelse(df1$Hospital == 25, 1, ifelse(df1$Hospital == 26, 1, ifelse(df1$Hospital == 28, 1, 0))))))))) 
  
table(df1$designation)

#designation date variable

df1$designate_date <- ifelse(df1$Hospital == 3, "01/08/2019", ifelse(df1$Hospital == 7, "01/04/2020", ifelse(df1$Hospital == 11, "01/10/2019", ifelse(df1$Hospital == 12, "01/01/2020", ifelse(df1$Hospital == 18, "01/10/2019", ifelse(df1$Hospital == 20, "01/01/2020", ifelse(df1$Hospital == 25, "01/08/2019", ifelse(df1$Hospital == 26, "01/01/2020",ifelse(df1$Hospital == 28, "01/11/2020", NA)))))))))
df1$designate_date <- as.Date(df1$designate_date, "%d/%m/%Y")
table(df1$designate_date)

df1$designated <- ifelse(df1$designate_date <= df1$dob, 1, 0)
table(df1$designated)

#making public/private variable
df1$private <- ifelse(df1$Hospital == 14, 1, ifelse(df1$Hospital == 17, 1, ifelse(df1$Hospital == 19, 1, ifelse(df1$Hospital == 26, 1, 0))))
```

## Create plot to get a sense of how to utilize enrollment status variable
For more information regarding time lag, check alive_thrive_aim2.Rmd
```{r}
#creating graph of births by enrollment status

library(ggplot2)

point_plot <- df1 %>%
   filter(!is.na(enrolled)) %>%
     ggplot(aes(dob, id, color = factor(enrolled))) + geom_point() + labs(title = "enrolled at birth")

bar_plot <- df1 %>% 
  filter(!is.na(enrolled)) %>%
    ggplot(aes(x = dob, fill = factor(enrolled))) + geom_bar()


point_plot
bar_plot

#df2 <- df1

#df2 <- df2 %>%
#   filter(!is.na(enrolled))

#ggplot(df2, aes(x = dob)) + 
#  geom_line(aes(y = enrolled)) + geom_line(aes(y = designated))
     
               
```

## Create covariates for regression analysis
```{r}
#creating binary yes/no versions that exclude "don't know" for composite score
df1$excl_bf <- ifelse(df1$Q02a == 1, 1, 0)
df1$skin <- ifelse(df1$Q04 == 1, 1, 0)
df1$bf_90 <- ifelse(df1$Q06 == 1, 1, 0)
#reverse coding formula score -- 1 if they were *not* given formula
df1$no_formula <- ifelse(df1$Q07 == 0, 1, 0)
df1$counseling <- ifelse(df1$Q08 == 1, 1, 0)
#recoding counseling formula -- 1 if they were *not* counseled to use formula
df1$counsel_formula <- ifelse(df1$Q09 == 0, 1, 0)

#creating c-section variable
df1$section <- ifelse(df1$Q03 == 2, 1, 0)

#creating Kinh/non-Kinh ethnicity variable
df1$kinh <- ifelse(df1$Q12 == 1, 1, 0)

#recoding Q02 as numeric (is s/he breastfed)
df1$Q02 <- as.numeric(df1$Q02)

#creating pre-term variable (created but not used)
df1$preterm <- ifelse(df1$Q03a == 1, 1, 0)

#creating seeing advertisement for forumula (created but not used)
df1$advert <- ifelse(df1$Q10a0 == 1, 1, 0)

#composite score outcome (created but not used)
df1$bf_score <- df1$Q02 + df1$excl_bf + df1$skin + df1$bf_90 + df1$no_formula + df1$counseling + df1$counsel_formula

write.csv(df1,'alive_thrive_clean_w_covars_20200920.csv')
```


